# Author: J. Hickman
# conda install -c conda-forge wikipedia
# conda install -c conda-forge wordcloud
# pip install wikipedia_sections
import matplotlib.pyplot as plt
import wikipedia
import nltk
import string
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.sentiment import SentimentIntensityAnalyzer
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
# RELOAD FILE AND PRETEND THAT IS OUR STARTING POINT
df = pd.read_csv('wiki-crawl-results.csv')
print(df.shape)

# CONVERT FROM STRING LABELS TO INTEGERS
# Each distinct label gets the index of its first appearance. A dict makes
# the per-row lookup O(1) instead of rescanning the label list for every row.
labels = []            # distinct labels, in order of first appearance
label_to_index = {}    # label string -> integer id
y1 = []
for label in df["label"]:
    if label not in label_to_index:
        label_to_index[label] = len(labels)
        labels.append(label)
        print("index =", len(labels) - 1, ": label =", label)
    y1.append(label_to_index[label])
y1 = np.array(y1)

# CONVERT DF TO LIST OF STRINGS
corpus = df["text"].to_list()
y2 = df["sentiment"].to_numpy()
print("number of text chunks = ", len(corpus))
print(corpus[0:3])
# OUTPUT: (5693, 3) index = 0 : label = pizza index = 1 : label = metallurgy index = 2 : label = basketball number of text chunks = 5693 ['word first appeared latin text town gaeta still part byzantine empire 997 ad text state tenant certain property give bishop gaeta duodecim pizze twelve every christmas day another twelve every easter sunday suggested etymology include byzantine greek late latin pitta cf', 'modern greek pitta bread apulia calabrian byzantine italy pitta round flat bread baked oven high temperature sometimes topping', 'word pitta turn traced either ancient greek pikte fermented pastry latin became picta ancient greek pissa attic pitta pitch tea bran bran bread']
# INITIALIZE COUNT VECTORIZER
# min_df=0.001 means "ignore terms that appear in less than 0.1% of the documents".
# (An integer min_df, e.g. 5, would instead mean "fewer than 5 documents".)
vectorizer = CountVectorizer(min_df=0.001)

# RUN COUNT VECTORIZER ON OUR CORPUS
Xs = vectorizer.fit_transform(corpus)
X = np.array(Xs.todense())

# CONVERT TO ONE-HOT (PRESENCE/ABSENCE) VECTORS
# Equivalent to the original ceil(X / column-max) trick, but avoids any
# divide-by-zero warning for all-zero columns: 1.0 if the term occurs, else 0.0.
X = (X > 0).astype(float)

# DOUBLE CHECK
print(X.shape, y1.shape, y2.shape)
# OUTPUT: (5693, 2693) (5693,) (5693,)
# Distance between sentence vectors for a subset of data
# TAKE A RANDOM SUBSET OF ROWS (without replacement) TO KEEP THE O(n^2)
# DISTANCE COMPUTATION CHEAP
num_rows_keep = 250
index = np.sort(np.random.choice(X.shape[0], num_rows_keep, replace=False))
tmp1 = X[index, :]

# COMPUTE PAIRWISE COSINE-SIMILARITY MATRIX
# Row norms are loop-invariant, so compute them once up front. The guard
# runs BEFORE the division (the original computed the quotient first, which
# raised divide-by-zero / NaN warnings for all-zero rows).
norms = np.linalg.norm(tmp1, axis=1)
n_rows = tmp1.shape[0]
dij = np.zeros((n_rows, n_rows))
for i in range(n_rows):
    for j in range(n_rows):
        # diagonal and all-zero rows stay 0, matching the original output
        if i == j or norms[i] == 0.0 or norms[j] == 0.0:
            continue
        # cosine similarity between row vectors i and j
        dij[i, j] = np.dot(tmp1[i, :], tmp1[j, :]) / (norms[i] * norms[j])

import seaborn as sns
sns.heatmap(dij, annot=False)  # , linewidths=.05)
print(dij.shape)
print(dij)
# OUTPUT: (250, 250) [[0. 0.09128709 0. ... 0. 0. 0.12909944] [0.09128709 0. 0. ... 0. 0. 0. ] [0. 0. 0. ... 0. 0. 0. ] ... [0. 0. 0. ... 0. 0.1118034 0. ] [0. 0. 0. ... 0.1118034 0. 0.28867513] [0.12909944 0. 0. ... 0. 0.28867513 0. ]]
from sklearn.decomposition import PCA

# COMPUTE PCA WITH 10 COMPONENTS
pca = PCA(n_components=10)
# Fit once and project in one call; the original called fit(X) and then
# fit_transform(X), which refit the identical model a second time.
principal_components = pca.fit_transform(X)
print(pca.explained_variance_ratio_)
print(pca.singular_values_)

# PRINCIPAL COMPONENT PROJECTIONS AS A DATAFRAME (one column per component)
df2 = pd.DataFrame(data=principal_components)  # , columns=['PC1','PC2','PC3','PC4','PC5'])
df3 = pd.concat([df2, df['label']], axis=1)

# FIRST TWO COMPONENTS, COLORED BY LABEL
sns.scatterplot(data=df2, x=0, y=1, hue=df["label"])
plt.show()

# 3D PLOT OF FIRST THREE COMPONENTS
ax = plt.axes(projection='3d')
ax.scatter3D(df2[0], df2[1], df2[2], c=y1)
plt.show()

# PAIRPLOT OF ALL COMPONENTS
sns.pairplot(data=df3, hue="label")
plt.show()
# OUTPUT: [0.01145308 0.00754586 0.00724713 0.00638084 0.00598309 0.00555053 0.00519602 0.005078 0.00502081 0.00491957] [25.62685355 20.80120448 20.38530534 19.12814789 18.52238528 17.84026552 17.26114092 17.06397622 16.96762047 16.79567683]